# Reset the workspace, then pull in the shared project helpers.
rm(list = ls())
source("functions.R")
quanteda_options(threads = 8)

# Seed-word dictionary that defines the candidate topics.
dict <- dictionary(file = "topic_candidate.yml")

# Load the pre-tokenised corpus and keep only documents from 1991 onwards.
toks <- tokens_subset(readRDS("data/data_tokens_sent.RDS"), year >= 1991)

# Merge multi-word dictionary entries into single, space-joined tokens so
# that they are treated as one feature downstream.
toks <- tokens_compound(toks, dict, join = FALSE, concatenator = " ")

# Fit one Newsmap model per seed dictionary and predict topics for the
# human-coded documents. Everything is built inside local() so that only
# `pred` is left in the workspace.
pred <- local({
  # The feature matrix and the human-coded evaluation subset do not depend on
  # the seed dictionary, so they are built once and shared by all models
  # instead of being recomputed for each one.
  # NOTE(review): the `remove` argument of dfm() is, as far as I can tell,
  # deprecated in quanteda >= 3 -- confirm against the installed version
  # before upgrading the package.
  mt <- dfm(toks, remove = c("", stopwords("en"))) %>%
    dfm_trim(min_termfreq = 10)
  mt_coded <- dfm_subset(mt, !is.na(topic_human) & ntoken(mt) > 0)

  # Train a Newsmap model on the full corpus, using dictionary matches from
  # `seeds` as labels, then predict a topic for every human-coded document.
  #   seeds: dictionary whose keys supply the training labels
  #   lvl:   dictionary level passed to tokens_lookup()
  # Returns the document variables of `mt_coded` with an added `topic` column.
  fit_and_predict <- function(seeds, lvl) {
    mt_key <- dfm(tokens_lookup(toks, seeds, levels = lvl))
    map <- textmodel_newsmap(mt, mt_key)
    dat <- docvars(mt_coded)
    dat$topic <- predict(map, newdata = mt_coded)
    dat
  }

  list(
    # Original
    knowledge = fit_and_predict(dict[["knowledge"]], lvl = 1),
    frequency = fit_and_predict(dict[["frequency"]], lvl = 1),
    # Both top-level groups combined; labels are taken from the second level.
    all = fit_and_predict(dict, lvl = 2)
  )
})

saveRDS(pred, "data_prediction.RDS")


